This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
If you are viewing it in a browser (HTML file), the original code can be found and executed in the associated .Rmd file.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
# Used for calculating AUC and more
# library(mltools)
# library(data.table)
# SOM support libraries
library(kohonen)
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(ggplot2)
library(sp)
library(maptools)
## Checking rgeos availability: TRUE
## Please note that 'maptools' will be retired by the end of 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
library(reshape2)
library(rgeos)
## rgeos version: 0.5-9, (SVN revision 684)
## GEOS runtime version: 3.9.1-CAPI-1.14.2
## Please note that rgeos will be retired by the end of 2023,
## plan transition to sf functions using GEOS at your earliest convenience.
## GEOS using OverlayNG
## Linking to sp version: 1.4-6
## Polygon checking: TRUE
set.seed(10)
# ONLY IF RUNNING NATIVELY, ELSE EXECUTE NEXT BLOCK IN R NOTEBOOK
# setwd("~/OneDrive/University/INFO411_DataMining/Project/RProject/Project4/DataSet")
### Helping Functions
# Bin a spamicity score into a factor label: "Spam" when the score
# exceeds 0.5, otherwise "NonSpam".
binningSpam <- function(value) {
  label <- if (value > 0.5) "Spam" else "NonSpam"
  factor(label)
}
# Numeric spam binning: 1 for Spam (spamicity > 0.5), otherwise 0.
# FIX: the original returned 1 from both branches, so every score was
# mapped to 1; the else branch must return 0.
binningSpamNum <- function(value) {
  if (value > 0.5) {
    return(1)
  }
  return(0)
}
We’ll import in the following data sets: - trainRAW - The labels we’ll use for training that shows which host IDs are “Spam”, “not Spam”, or “Undecided” - testRAW - The labels we’ll use for the testing that shows which host IDs are “Spam”, “not Spam”, or “Undecided” - linkRAW - Domains and their associated linked attributes. - The data here is not normalized, not transformed. - linkTransfromedRAW - Domains and their associated linked attributes. - The data here is normalized via Log(10), and has multiple transformations performed and added as new columns - For example, there are columns where the PageRank of the Homepage is divided by the next linked PageRank
# Load the raw label files (host id, spamicity score, label text) and the
# link-based feature sets (raw and log-transformed variants).
trainRAW <- data.frame(read.csv("webspam-uk2007-set1-1-10_TRAINING/WEBSPAM-UK2007-SET1-labels.txt", header= F, sep=" ", dec="."))
testRAW <- data.frame(read.csv("webspam-uk2007-set2-1-10_TEST/WEBSPAM-UK2007-SET2-labels.txt", header= F, sep=" ", dec="."))
linkRAW <- data.frame(read.csv("uk-2007-05.link_based_features.csv", header=T))
linkTransfromedRAW <- data.frame(read.csv("uk-2007-05.link_based_features_transformed.csv", header=T))
# Original Link Train DS
# Join labels to features on host id, then drop id/label-text columns,
# keeping V3 (the spamicity score) plus the feature columns.
linkTrain <- merge(trainRAW,linkRAW,by.x = "V1", by.y="X.hostid")
linkTrain <- linkTrain[,c(-1,-2,-4,-5)]
# V3 arrives as text; non-numeric entries become NA (hence the warning)
linkTrain$V3 <- as.numeric(as.character(linkTrain$V3))
## Warning: NAs introduced by coercion
linkTrain <- na.omit(linkTrain)
# Correlation for the original Train DS. Correlation is not strong at all.
# Absolute correlation of every column against the target V3, sorted descending.
corTable <- abs(cor(linkTrain,y=linkTrain$V3))
corTable = corTable[order(corTable, decreasing = T),,drop=F]
head(corTable,20)
## [,1]
## V3 1.00000000
## truncatedpagerank_4_hp 0.06409890
## truncatedpagerank_1_hp 0.06373734
## truncatedpagerank_3_hp 0.06356978
## truncatedpagerank_2_hp 0.06344003
## pagerank_hp 0.06308755
## pagerank_mp 0.05911198
## truncatedpagerank_4_mp 0.05338514
## truncatedpagerank_3_mp 0.05316211
## truncatedpagerank_2_mp 0.05280514
## truncatedpagerank_1_mp 0.05162838
## siteneighbors_1_hp 0.05042254
## prsigma_hp 0.04911449
## siteneighbors_4_hp 0.04650915
## eq_hp_mp 0.04435796
## assortativity_mp 0.04260146
## siteneighbors_3_hp 0.03862848
## siteneighbors_4_mp 0.03682804
## prsigma_mp 0.03482937
## siteneighbors_3_mp 0.03445648
From a quick glance, we can see that none of the data has any strong correlation with the result (at least without transformation).
Let’s take a look at the other dataset.
# Loading the Train DS that has been logged
# Loading the Train DS that has been logged
# Same join/cleanup as above, but on the log-transformed feature set.
linkTransformedTrain <- merge(trainRAW,linkTransfromedRAW,by.x = "V1", by.y="X.hostid")
linkTransformedTrain <- linkTransformedTrain[,c(-1,-2,-4)]
# Coerce the target; text labels become NA (hence the warning) and are dropped
linkTransformedTrain$V3 <- as.numeric(as.character(linkTransformedTrain$V3))
## Warning: NAs introduced by coercion
linkTransformedTrain <- na.omit(linkTransformedTrain)
# Correlation table.
# Absolute correlation of every transformed column against V3, sorted descending.
corTable2 <- abs(cor(linkTransformedTrain,y=linkTransformedTrain$V3))
corTable2 = corTable2[order(corTable2, decreasing = T),,drop=F]
head(corTable2,21)
## [,1]
## V3 1.0000000
## log_OP_truncatedpagerank_1_mp_div_pagerank_mp_CP_ 0.1515353
## log_OP_outdegree_mp_div_pagerank_mp_CP_ 0.1505800
## log_OP_truncatedpagerank_2_mp_div_pagerank_mp_CP_ 0.1488675
## L_outdegree_mp 0.1481146
## L_avgin_of_out_mp 0.1446143
## log_OP_truncatedpagerank_3_mp_div_pagerank_mp_CP_ 0.1435711
## log_OP_outdegree_hp_div_pagerank_hp_CP_ 0.1432103
## log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_ 0.1422249
## log_OP_truncatedpagerank_4_mp_div_pagerank_mp_CP_ 0.1409383
## L_outdegree_hp 0.1404630
## L_avgin_of_out_hp 0.1395381
## log_OP_avgin_of_out_hp_mul_outdegree_hp_CP_ 0.1359523
## log_OP_min_OP_truncatedpagerank_2_hp_div_truncatedpagerank_1_hp_truncatedpagerank_3_hp_div_truncatedpagerank_2_hp_truncatedpagerank_4_hp_div_truncatedpagerank_3_hp_CP__CP_ 0.1189911
## L_prsigma_hp 0.1188065
## log_OP_truncatedpagerank_4_hp_div_pagerank_hp_CP_ 0.1178506
## log_OP_truncatedpagerank_3_hp_div_pagerank_hp_CP_ 0.1177164
## log_OP_truncatedpagerank_2_hp_div_pagerank_hp_CP_ 0.1172431
## log_OP_prsigma_hp_div_pagerank_hp_CP_ 0.1168947
## log_OP_truncatedpagerank_1_hp_div_pagerank_hp_CP_ 0.1123874
## L_prsigma_mp 0.1097668
# Top-21 correlation rows; row 1 is the target V3 itself
headhead <- head(corTable2,21)
headnames <- row.names(headhead)
# Top-10 feature names (positions 2:11 skip V3)
headnames <- headnames[2:11]
# Top-20 feature names (positions 2:21 skip V3)
headnames20 <- row.names(headhead)[2:21]
# Class balance of the training labels; scores of exactly 0.5
# ("Undecided") are excluded by both counts
sum(linkTransformedTrain$V3 <0.5)
## [1] 3776
sum(linkTransformedTrain$V3 > 0.5)
## [1] 222
Looking at the correlation of the corTable, we find a lot more attributes that have stronger correlation.
We also see that the distribution of Spam to Non-Spam is very skewed, thus we’ll try to balance the dataset before visualizing the top 10 correlated.
We’ll utilize an oversampling technique to try to ensure we have enough samples for training. (2x the amount of “Spam” we have)
### OVERSAMPLING and splitting
# Undecided (0.5) are dropped
# I tried undersampling... no distinct change
# Split by label; scores of exactly 0.5 ("Undecided") fall in neither subset
linkTransformedTrainSpam <- subset(linkTransformedTrain, V3 > 0.5)
linkTransformedTrainNotSpam <- subset(linkTransformedTrain, V3 < 0.5)
# Oversample spam WITH replacement up to 444 rows (2x the 222 spam hosts),
# and undersample non-spam WITHOUT replacement to the same size
linkTransformedTrainSpam <- linkTransformedTrainSpam[sample(1:nrow(linkTransformedTrainSpam), size=444, replace=T),]
linkTransformedTrainNotSpam <- linkTransformedTrainNotSpam[sample(1:nrow(linkTransformedTrainNotSpam), size=444, replace=F),]
linkTransformedBalanced = rbind(linkTransformedTrainSpam, linkTransformedTrainNotSpam)
# Factor label column used by the classifiers and plots below
linkTransformedBalanced$binnedY <- sapply(linkTransformedBalanced$V3, binningSpam)
#linkTransformedTest$binnedY <- sapply(linkTransformedTest$V3, binningSpam)
# One plot per top-correlated feature against the binned label
plot(linkTransformedBalanced$log_OP_truncatedpagerank_1_mp_div_pagerank_mp_CP_,linkTransformedBalanced$binnedY)
plot(linkTransformedBalanced$log_OP_truncatedpagerank_2_mp_div_pagerank_mp_CP_,linkTransformedBalanced$binnedY)
plot(linkTransformedBalanced$log_OP_outdegree_mp_div_pagerank_mp_CP_,linkTransformedBalanced$binnedY)
plot(linkTransformedBalanced$L_outdegree_mp,linkTransformedBalanced$binnedY)
plot(linkTransformedBalanced$log_OP_truncatedpagerank_3_mp_div_pagerank_mp_CP_,linkTransformedBalanced$binnedY)
plot(linkTransformedBalanced$L_avgin_of_out_mp,linkTransformedBalanced$binnedY)
plot(linkTransformedBalanced$log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_,linkTransformedBalanced$binnedY)
plot(linkTransformedBalanced$log_OP_truncatedpagerank_4_mp_div_pagerank_mp_CP_,linkTransformedBalanced$binnedY)
plot(linkTransformedBalanced$L_avgin_of_out_hp,linkTransformedBalanced$binnedY)
plot(linkTransformedBalanced$log_OP_outdegree_hp_div_pagerank_hp_CP_,linkTransformedBalanced$binnedY)
plot(linkTransformedBalanced$L_outdegree_hp,linkTransformedBalanced$binnedY)
plot(linkTransformedBalanced$log_OP_avgin_of_out_hp_mul_outdegree_hp_CP_,linkTransformedBalanced$binnedY)
We can see that on some charts, such as L_outdegree_mp, there are a few instances
in both spam and non-spam where the value sits at around -50. Let’s take a look at
what those instances are and how many of them are spam and non-spam.
# Count how many of the L_outdegree_mp < -40 outliers are spam vs non-spam
sum(linkTransformedBalanced[linkTransformedBalanced$L_outdegree_mp < -40,]$V3 > 0.5)
## [1] 198
sum(linkTransformedBalanced[linkTransformedBalanced$L_outdegree_mp < -40,]$V3 < 0.5)
## [1] 75
There seems to be a skew of 1 value compared to the rest, as such, we’ll leave it in as it could be helpful for our algorithms later.
Let’s visualise it with an SOM to see how well defined clusters are.
#Colour palette definition
pretty_palette <- c("#1f77b4", '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2')
# ------------------- SOM TRAINING ---------------------------
{
#choose the variables with which to train the SOM
# FIX: the original indexed with `headhead` (the correlation matrix itself);
# data-frame column indexing truncates those correlations to integer
# positions instead of selecting by name. Use the top-10 correlated
# feature names (headnames) as intended.
data_train <- linkTransformedBalanced[,headnames]
# now train the SOM using the Kohonen method
data_train_matrix <- as.matrix(scale(data_train))
names(data_train_matrix) <- names(data_train)
require(kohonen)
x_dim=15
y_dim=15
som_grid <- somgrid(xdim = x_dim, ydim=y_dim, topo="hexagonal")
# Train the SOM model!
# kohonen < 3 and >= 3 take different arguments (n.hood vs mode).
if (packageVersion("kohonen") < 3){
system.time(som_model <- som(data_train_matrix,
grid=som_grid,
rlen=1000,
alpha=c(0.9,0.01),
n.hood = "circular",
keep.data = TRUE ))
}else{
system.time(som_model <- som(data_train_matrix,
grid=som_grid,
rlen=1000,
alpha=c(0.9,0.01),
mode="online",
# FIX: was `false`, which is undefined in R; the logical constant is FALSE
normalizeDataLayers=FALSE,
keep.data = TRUE ))
}
# Training progress: mean distance to the closest codebook vector per iteration
plot(som_model, type = "changes")
#counts within nodes
plot(som_model, type = "counts", main="Node Counts")
#map quality
plot(som_model, type = "quality", main="Node Quality/Distance")
#neighbour distances
plot(som_model, type="dist.neighbours", main = "SOM neighbour distances", palette.name=grey.colors)
#code spread
plot(som_model, type = "codes")
plotHeatMap <- function(som_model, data, variable=0){
# Plot a heatmap for any variable from the data set "data".
# If variable is 0, an interactive window will be provided to choose the variable.
# If not, the variable in "variable" will be plotted.
require(dummies)
require(kohonen)
interactive <- TRUE
while (interactive == TRUE){
if (variable == 0){
#show interactive window.
color_by_var <- select.list(names(data), multiple=FALSE,
graphics=TRUE,
title="Choose variable to color map by.")
# check for user finished.
if (color_by_var == ""){ # if user presses Cancel - we quit function
return(TRUE)
}
interactive <- TRUE
color_variable <- data.frame(data[, color_by_var])
} else {
color_variable <- data.frame(data[, variable])
color_by_var <- names(data)[variable]
interactive <- FALSE
}
#if the variable chosen is a string or factor -
#Get the levels and ask the user to choose which one they'd like.
if (class(color_variable[,1]) %in% c("character", "factor", "logical")){
#want to spread this out into dummy factors - but colour by one of those.
temp_data <- dummy.data.frame(color_variable, sep="_")
# NOTE(review): hard-coded dummy-column name produced by dummy.data.frame
# for the "Spam" level; bypasses the interactive level chooser.
chosen_factor <- "data...variable._Spam"
color_variable <- temp_data[, chosen_factor]
rm(temp_data, chosen_factor)
color_by <- color_variable
} else {
#impute the missing values with the mean.
color_variable[is.na(color_variable[,1]),1] <- mean(color_variable[,1], na.rm=TRUE)
color_by <- color_variable[,1]
}
# Mean of the colouring variable per SOM unit, drawn as a property map
unit_colors <- aggregate(color_by, by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)
plot(som_model, type = "property", property=unit_colors[,2], main=color_by_var)
}
}
# Column 140 is presumably binnedY in linkTransformedBalanced -- TODO confirm
plotHeatMap(som_model, linkTransformedBalanced, variable=140)
}
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of replacement
## length
While there are clusters that are formed from our data, the clusters do not have distinct Spam or non-spam results, thus will likely result in our classification model not working well. We’ll proceed to try to classify, but before then we’ll split our data and prepare our testing data
# Loading some libraries here because if I load them before SOM it bugs out
# MLP, ROC
library(RSNNS)
## Loading required package: Rcpp
##
## Attaching package: 'RSNNS'
## The following object is masked from 'package:kohonen':
##
## som
# DT
library(rpart)
#
# Forest
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
#
# Naive Bayes
library(e1071)
# Keep only the top-10 correlated features plus the binned label
linkTransformedBalancedSelected <- linkTransformedBalanced[,c(headnames,"binnedY")]
# 70/30 train/test split indicator.
# FIX: rep(0, 0.7 * n) truncates fractional counts, so the original
# indicator could be shorter than nrow() and was silently recycled when
# subsetting. Build it at exactly nrow() length instead.
n_rows <- nrow(linkTransformedBalancedSelected)
n_train <- round(0.7 * n_rows)
split1<- sample(rep(c(0, 1), c(n_train, n_rows - n_train)))
trainDS <- linkTransformedBalancedSelected[split1 == 0, ]
testDS <- linkTransformedBalancedSelected[split1 == 1, ]
# FOR LATER
linkTransformedBalancedTrain <- linkTransformedBalanced[split1 == 0, ]
linkTransformedBalancedTest <- linkTransformedBalanced[split1 == 1, ]
# Build the held-out TEST set from the separate test label file,
# same join/cleanup as for the training data
linkTransformedTest <-merge(testRAW,linkTransfromedRAW,by.x = "V1", by.y="X.hostid")
linkTransformedTest <- linkTransformedTest[,c(-1,-2,-4)]
linkTransformedTest$V3 <- as.numeric(as.character(linkTransformedTest$V3))
## Warning: NAs introduced by coercion
linkTransformedTest <- na.omit(linkTransformedTest)
linkTransformedTest$binnedY <- sapply(linkTransformedTest$V3, binningSpam)
In attempt to improve our RF Model, we will try a different set of attributes:
The following will be attempted - Using Top 20 Correlated - Whole Data set (with parameter adjustment through rfcv) - Top 20 most important (based on what we got from the Whole Data Set) - Once hyper parameters are fine tuned, training on the whole train DS and using that on our test DS
After more research was done on RF, it was noticed that attribute selection may not be as important for RF, as it will typically use the Gini index, information gain, or some other splitting criterion to decide which variable to use. Thus, with fine tuning, we could in theory pass the whole DS into a forest and have it work.
First, we want to know how many attributes we should use.
We can use rfcv() to figure out which one works best.
# Rebuild the binned label, then separate predictors (X) from target (Y)
# for both the balanced and the full (unbalanced) training sets
linkTransformedBalanced$binnedY <- sapply(linkTransformedBalanced$V3, binningSpam)
linkTransformedBalancedY <- linkTransformedBalanced$binnedY
linkTransformedBalancedX <- subset(linkTransformedBalanced,select=-c(binnedY,V3))
linkTransformedTrain$binnedY <- sapply(linkTransformedTrain$V3, binningSpam)
linkTransformedTrainY <- linkTransformedTrain$binnedY
linkTransformedTrainX <- subset(linkTransformedTrain,select=-c(binnedY,V3))
# Cross-validated error vs. number of predictors (randomForest::rfcv),
# once on the balanced set and once on the full training set
result <- rfcv(linkTransformedBalancedX,linkTransformedBalancedY, recursive = T)
with(result, plot(n.var, error.cv, log="x", type="o", lwd=2))
result <- rfcv(linkTransformedTrainX,linkTransformedTrainY, recursive = T)
with(result, plot(n.var, error.cv, log="x", type="o", lwd=2))
From our results, we can see that if we were to train on our 70% split, 10-15 would be better. But if we were to test on our full training data, 5 would be better
We can then attempt to brute force again to find how big should our tree go
# Drop the raw score, keeping the factor label as the target.
# NOTE(review): linkTransformedBalancedNoFactor is not referenced again in
# the visible code -- possibly left over from an earlier iteration.
linkTransformedBalancedNoFactor <- subset(linkTransformedBalancedTrain, select=-c(V3))
trainDS.all <- subset(linkTransformedBalancedTrain, select=-c(V3))
testDS.all = linkTransformedBalancedTest
# Exhaustive grid search over ntree/mtry; kept commented out because the
# full sweep is slow. Results informed the hyper-parameters used below.
# DO NOT RUN AS EXECUTION CAN BE SLOW
# {
# totalAttempts = data.frame(row.names=c("ntree","mtry","accuracy","precision","recall"))
#
# for (ntree in seq(10,200, by=10)){
# for (mtry in c(2:20)){
# forest <- randomForest(binnedY ~ ., data = trainDS.all, nodesize=1, ntree=ntree, mtry=mtry)
# forestPredict <- predict(forest, testDS.all, type="class")
# tempTable <- table(testDS.all$binnedY, forestPredict)
# TP <- tempTable[1,1]
# FP <- tempTable[2,1]
# FN <- tempTable[1,2]
# TN <- tempTable[2,2]
#
# accuracy <- (TP+TN)/(TP+TN+FP+FN)
# precision <- (TP/(TP+FP))
# recall <- (TP/(TP+FN))
#
# totalAttempts <- rbind(totalAttempts,
# data.frame(ntree=ntree,
# mtry=mtry,
# accuracy=accuracy,
# precision = precision,
# recall=recall))
# }
# }
# }
The “best” attributes based on our testing above was the following: - node size 1 - amount of trees to use: 120 (makes sense, as even though many places recommend a high number, we have little samples) - attributes to be used: 13
# Final RF with the tuned hyper-parameters (nodesize=1, ntree=120, mtry=13)
forest.All <- randomForest(binnedY ~ ., data = trainDS.all, nodesize=1, ntree=120, mtry=13)
plot(forest.All)
# Evaluate on the 30% held-out split of the balanced training data
forestPredict.All <- predict(forest.All, testDS.all, type="class")
tempTable.All <- table(testDS.all$binnedY, forestPredict.All)
tempTable.All
## forestPredict.All
## Spam NonSpam
## Spam 126 19
## NonSpam 20 101
# NOTE(review): plotROC comes from RSNNS, but auc_roc is from mltools whose
# library() call is commented out at the top of the file -- confirm it is
# attached when this chunk runs.
plotROC(as.integer(testDS.all$binnedY == "Spam"), as.integer(forestPredict.All == "Spam"))
auc_roc(as.integer(testDS.all$binnedY == "Spam"), as.integer(forestPredict.All == "Spam"))
## [1] 0.8523402
## [1] 0.8523402
We got 85.2%. Not much better or worse sadly
# Evaluate the tuned forest on the untouched TEST label set
forestPredict.All.Testing <- predict(forest.All, linkTransformedTest, type="class")
table(linkTransformedTest$binnedY, forestPredict.All.Testing)
## forestPredict.All.Testing
## Spam NonSpam
## NonSpam 326 1660
## Spam 53 69
# ROC/AUC of the hard class predictions against the true labels
plotROC(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict.All.Testing == "Spam"))
auc_roc(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict.All.Testing == "Spam"))
## [1] 0.5499671
## [1] 0.5499671
We got a higher AUC area of 55%, but nothing fantastic.
Because we just might have too little data, we are going to attempt to use the parameters we set and train it on all the data we have and see if it nets us a better result
# Retrain with the same hyper-parameters on ALL balanced data (no 70/30 split)
trainDS.all.all <- subset(linkTransformedBalanced, select=-c(V3))
forest.All.All <- randomForest(binnedY ~ ., data = trainDS.all.all, nodesize=1, ntree=120, mtry=13)
plot(forest.All.All)
# Evaluate on the held-out TEST label set
forestPredict.All.All.Testing <- predict(forest.All.All, linkTransformedTest, type="class")
table(linkTransformedTest$binnedY, forestPredict.All.All.Testing)
## forestPredict.All.All.Testing
## Spam NonSpam
## NonSpam 287 1699
## Spam 46 76
plotROC(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict.All.All.Testing == "Spam"))
auc_roc(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict.All.All.Testing == "Spam"))
## [1] 0.5476606
## [1] 0.5476606
There is a tad amount of improvement, but nothing major either.
This might seem a bit redundant, but just as a check we tried training it using what the gini split thought was “Important” based on the training DS above.
# Rank features by the forest's variable importance
importanceRankAll <- data.frame(importance(forest.All))
# FIX: order() was called on the whole data frame, which emitted
# "cannot xtfrm data frames"; order on the importance column itself.
importanceRankAllNames <- row.names(importanceRankAll)[order(importanceRankAll[[1]], decreasing=T)]
# FIX: was [1:21] (21 entries); unlike the correlation table there is no
# target row in position 1 to skip here, so the top 20 is [1:20].
importanceRankAllNames.Top20 <- importanceRankAllNames[1:20]
linkTransformedBalanced.AllSelected <- linkTransformedBalanced[,c(importanceRankAllNames.Top20,"binnedY")]
# Smaller forest trained only on the importance-selected features
forestAll2 <- randomForest(binnedY ~ ., data = linkTransformedBalanced.AllSelected, nodesize=1, ntree=50, mtry=10)
plot(forestAll2)
forestPredict.top20important <- predict(forestAll2, linkTransformedTest, type="class")
tempTable.top20important <- table(linkTransformedTest$binnedY, forestPredict.top20important)
tempTable.top20important
## forestPredict.top20important
## Spam NonSpam
## NonSpam 380 1606
## Spam 62 60
plotROC(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict.top20important == "Spam"))
auc_roc(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict.top20important == "Spam"))
## [1] 0.5521285
## [1] 0.5521285
As expected, we got a very similar score compared to training on the whole DS. Makes sense as it will converge at a point.
We tried training the system without balancing the data set (thus a lot more spam), and it didn’t work.
# Train on the raw (unbalanced) training set for comparison; the heavy
# class skew makes the forest predict almost everything as NonSpam
linkTransformedTrain.noBalance <- subset(linkTransformedTrain, select=-c(V3))
forest.NoBalanceAll <- randomForest(binnedY ~ ., data = linkTransformedTrain.noBalance, nodesize=1, ntree=2000, mtry=13)
forestPredict.NoBalanceAll <- predict(forest.NoBalanceAll, linkTransformedTest, type="class")
tempTable.NoBalanceAll <- table(linkTransformedTest$binnedY, forestPredict.NoBalanceAll)
tempTable.NoBalanceAll
## forestPredict.NoBalanceAll
## NonSpam Spam
## NonSpam 1981 5
## Spam 115 7
The input data to a Forest must be at least somewhat balanced for it to work well.
As our attribute selection did not seem to help our models, we tried a different way to reduce the dimensions in hope of finding better clusters and patterns for our models to pickup using PCA.
As such, for our next test, we decided to PCA the whole DS and pass the new attributes to our models.
Due to time constraints, we were unable to selectively PCA specific columns instead of all the attributes at once. This will probably heavily affect our Naive Bayes model.
# PCA on the standardized full-training predictors.
# Note: `scale = TRUE` partially matches prcomp's `scale.` argument.
linkTransformedTrain.X <- subset(linkTransformedTrain,select=-c(binnedY,V3))
linkTransformedTrain.PCAParams <- prcomp(linkTransformedTrain.X, center = TRUE, scale = TRUE)
summary(linkTransformedTrain.PCAParams)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 6.5918 4.3415 3.06635 2.66024 2.43142 2.27430 2.25803
## Proportion of Variance 0.3149 0.1366 0.06813 0.05128 0.04284 0.03748 0.03695
## Cumulative Proportion 0.3149 0.4515 0.51958 0.57086 0.61370 0.65118 0.68813
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.88302 1.86017 1.70913 1.63723 1.51857 1.4860 1.38040
## Proportion of Variance 0.02569 0.02507 0.02117 0.01942 0.01671 0.0160 0.01381
## Cumulative Proportion 0.71382 0.73890 0.76006 0.77949 0.79620 0.8122 0.82601
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 1.36310 1.30581 1.26299 1.22541 1.13701 1.08186 1.02919
## Proportion of Variance 0.01346 0.01236 0.01156 0.01088 0.00937 0.00848 0.00768
## Cumulative Proportion 0.83947 0.85183 0.86339 0.87427 0.88364 0.89212 0.89979
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.98603 0.95545 0.93173 0.90426 0.87994 0.82567 0.81727
## Proportion of Variance 0.00705 0.00662 0.00629 0.00593 0.00561 0.00494 0.00484
## Cumulative Proportion 0.90684 0.91346 0.91975 0.92567 0.93128 0.93622 0.94106
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 0.79319 0.75156 0.67924 0.65971 0.6439 0.63329 0.61288
## Proportion of Variance 0.00456 0.00409 0.00334 0.00315 0.0030 0.00291 0.00272
## Cumulative Proportion 0.94562 0.94971 0.95306 0.95621 0.9592 0.96212 0.96484
## PC36 PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 0.59415 0.57943 0.56250 0.54235 0.53902 0.53239 0.50770
## Proportion of Variance 0.00256 0.00243 0.00229 0.00213 0.00211 0.00205 0.00187
## Cumulative Proportion 0.96740 0.96983 0.97213 0.97426 0.97636 0.97842 0.98029
## PC43 PC44 PC45 PC46 PC47 PC48 PC49
## Standard deviation 0.48273 0.46353 0.4556 0.44416 0.41394 0.38126 0.37694
## Proportion of Variance 0.00169 0.00156 0.0015 0.00143 0.00124 0.00105 0.00103
## Cumulative Proportion 0.98197 0.98353 0.9850 0.98647 0.98771 0.98876 0.98979
## PC50 PC51 PC52 PC53 PC54 PC55 PC56
## Standard deviation 0.35993 0.35001 0.34516 0.33563 0.31415 0.30306 0.29881
## Proportion of Variance 0.00094 0.00089 0.00086 0.00082 0.00072 0.00067 0.00065
## Cumulative Proportion 0.99073 0.99162 0.99248 0.99330 0.99401 0.99468 0.99532
## PC57 PC58 PC59 PC60 PC61 PC62 PC63
## Standard deviation 0.27676 0.26684 0.25105 0.24432 0.23228 0.22231 0.20528
## Proportion of Variance 0.00056 0.00052 0.00046 0.00043 0.00039 0.00036 0.00031
## Cumulative Proportion 0.99588 0.99639 0.99685 0.99728 0.99767 0.99803 0.99834
## PC64 PC65 PC66 PC67 PC68 PC69 PC70
## Standard deviation 0.19499 0.18493 0.17233 0.15592 0.12342 0.11258 0.11147
## Proportion of Variance 0.00028 0.00025 0.00022 0.00018 0.00011 0.00009 0.00009
## Cumulative Proportion 0.99861 0.99886 0.99908 0.99925 0.99936 0.99946 0.99955
## PC71 PC72 PC73 PC74 PC75 PC76 PC77
## Standard deviation 0.10584 0.09951 0.09206 0.07959 0.06897 0.06478 0.05348
## Proportion of Variance 0.00008 0.00007 0.00006 0.00005 0.00003 0.00003 0.00002
## Cumulative Proportion 0.99963 0.99970 0.99976 0.99981 0.99984 0.99987 0.99989
## PC78 PC79 PC80 PC81 PC82 PC83 PC84
## Standard deviation 0.05020 0.04994 0.04347 0.04008 0.03852 0.03527 0.03083
## Proportion of Variance 0.00002 0.00002 0.00001 0.00001 0.00001 0.00001 0.00001
## Cumulative Proportion 0.99991 0.99993 0.99994 0.99995 0.99996 0.99997 0.99998
## PC85 PC86 PC87 PC88 PC89 PC90 PC91
## Standard deviation 0.02876 0.02384 0.0203 0.01758 0.01492 0.01376 0.01023
## Proportion of Variance 0.00001 0.00000 0.0000 0.00000 0.00000 0.00000 0.00000
## Cumulative Proportion 0.99999 0.99999 1.0000 0.99999 1.00000 1.00000 1.00000
## PC92 PC93 PC94 PC95 PC96 PC97
## Standard deviation 0.008623 0.005907 0.005509 0.005442 0.004364 0.003708
## Proportion of Variance 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## Cumulative Proportion 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
## PC98 PC99 PC100 PC101 PC102 PC103
## Standard deviation 0.003044 0.002405 0.002306 0.001149 5.371e-14 2.335e-14
## Proportion of Variance 0.000000 0.000000 0.000000 0.000000 0.000e+00 0.000e+00
## Cumulative Proportion 1.000000 1.000000 1.000000 1.000000 1.000e+00 1.000e+00
## PC104 PC105 PC106 PC107 PC108
## Standard deviation 1.592e-14 1.529e-14 1.282e-14 9.225e-15 9.009e-15
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC109 PC110 PC111 PC112 PC113
## Standard deviation 7.932e-15 6.829e-15 6.678e-15 6.279e-15 5.686e-15
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC114 PC115 PC116 PC117 PC118
## Standard deviation 5.294e-15 4.613e-15 4.159e-15 4.099e-15 3.918e-15
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC119 PC120 PC121 PC122 PC123
## Standard deviation 3.88e-15 3.388e-15 3.068e-15 2.757e-15 2.258e-15
## Proportion of Variance 0.00e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.00e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC124 PC125 PC126 PC127 PC128
## Standard deviation 2.154e-15 1.87e-15 1.803e-15 1.776e-15 1.558e-15
## Proportion of Variance 0.000e+00 0.00e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.00e+00 1.000e+00 1.000e+00 1.000e+00
## PC129 PC130 PC131 PC132 PC133
## Standard deviation 1.427e-15 1.406e-15 1.375e-15 1.327e-15 1.173e-15
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC134 PC135 PC136 PC137 PC138
## Standard deviation 9.589e-16 8.874e-16 8.296e-16 6.298e-16 4.909e-16
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
We decided to use all the way to PCA 13 to capture ~80% of the variance in our data
# Keep the label plus the first 13 principal components (~80% of variance)
linkTransformedTrain.PCA <- data.frame(linkTransformedTrain$binnedY ,linkTransformedTrain.PCAParams$x)[,c(1:14)]
names(linkTransformedTrain.PCA)[1] <- "binnedY"
# TRANSFORMING PCA TEST
linkTransformedTest.x <- subset(linkTransformedTest,select=-c(binnedY,V3))
# FIX: the original did scale(test) %*% rotation, standardizing the test set
# by its OWN mean/sd, which does not match the training projection.
# predict.prcomp applies the training center/scale before rotating.
linkTransformedTest.PCA <- predict(linkTransformedTrain.PCAParams, newdata = linkTransformedTest.x)
linkTransformedTest.PCA <- data.frame(linkTransformedTest$binnedY, linkTransformedTest.PCA)
names(linkTransformedTest.PCA)[1] <- "binnedY"
We can then proceed with the same methods we used, but instead we’ll be passing in all the columns
# Creating Balanced DS, shuffling
# Creating Balanced DS, shuffling
{
# Double the spam rows (simple 2x oversample by duplication) ...
linkTransformedTrainSpam.PCA <- subset(linkTransformedTrain.PCA, binnedY == "Spam")
linkTransformedTrainSpam.PCA <- rbind(linkTransformedTrainSpam.PCA,linkTransformedTrainSpam.PCA)
linkTransformedTrainNotSpam.PCA <- subset(linkTransformedTrain.PCA, binnedY == "NonSpam")
# ... and undersample non-spam, without replacement, to the same size
linkTransformedTrainNotSpam.PCA <- linkTransformedTrainNotSpam.PCA[sample(1:nrow(linkTransformedTrainNotSpam.PCA), size=nrow(linkTransformedTrainSpam.PCA), replace=F),]
linkTransformedBalanced.PCA = rbind(linkTransformedTrainSpam.PCA, linkTransformedTrainNotSpam.PCA)
}
# Shuffle
rows <- sample(nrow(linkTransformedBalanced.PCA))
linkTransformedBalanced.PCA <- linkTransformedBalanced.PCA[rows,]
# NOTE(review): rep(0, 0.7 * n) truncates when 0.7*n is fractional, so this
# indicator can be shorter than nrow() and get recycled during subsetting --
# same issue as the earlier split; verify length(split1) == nrow().
split1<- sample(c(rep(0, 0.7 * nrow(linkTransformedBalanced.PCA)),
rep(1, 0.3 * nrow(linkTransformedBalanced.PCA))))
linkTransformedBalancedTrain.PCA <- linkTransformedBalanced.PCA[split1==0,]
linkTransformedBalancedTest.PCA <- linkTransformedBalanced.PCA[split1==1,]
library(RSNNS)
# Convert a spamicity score to a labelled factor: "Spam" above 0.5,
# "NonSpam" otherwise. (Redefined here alongside the RSNNS workflow.)
binningSpam <- function(value) {
  if (value > 0.5) {
    return(factor("Spam"))
  }
  factor("NonSpam")
}
# Predictors only: drop the label column from the balanced PCA data
trainValues <- linkTransformedBalanced.PCA
trainValues$binnedY = NULL
# One-hot encode the factor label for the MLP (RSNNS convention)
trainTargets <- decodeClassLabels(linkTransformedBalanced.PCA[,"binnedY"])
# 80/20 internal split, then normalize inputs on the training statistics
trainset <- splitForTrainingAndTest(trainValues, trainTargets, ratio=0.2)
trainset <- normTrainingAndTestSet(trainset)
# Single hidden layer of 20 units; small learning rate, long training
model <- mlp(trainset$inputsTrain, trainset$targetsTrain, size=c(20), learnFuncParams=c(0.001), maxit=4000, inputsTest=trainset$inputsTest, targetsTest=trainset$targetsTest)
predictTestSet <- predict(model,trainset$inputsTest)
# Confusion matrices: training fit, then the internal test split
confusionMatrix(trainset$targetsTrain,fitted.values(model))
## predictions
## targets 1 2
## 1 267 86
## 2 87 270
confusionMatrix(trainset$targetsTest,predictTestSet)
## predictions
## targets 1 2
## 1 60 31
## 2 27 60
# Four diagnostic plots in a 2x2 grid
par(mar=c(5.1,4.1,4.1,2.1))
par(mfrow=c(2,2))
plotIterativeError(model)
plotRegressionError(predictTestSet[,2], trainset$targetsTest[,2])
plotROC(fitted.values(model)[,2], trainset$targetsTrain[,2])
plotROC(predictTestSet[,2], trainset$targetsTest[,2])
# Score the held-out TEST set (columns 2:14 are PC1..PC13)
predictTestSet <- predict(model,linkTransformedTest.PCA[2:14]) # IF ADJUSTING LATER
confusionMatrix(linkTransformedTest.PCA$binnedY,predictTestSet)
## predictions
## targets 1 2
## 1 1037 949
## 2 27 95
plotROC(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), predictTestSet[,2])
auc_roc(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), predictTestSet[,2])
## [1] 0.5235126
MLP now works! But our results aren’t exactly fantastic with a AUC of
52.3%
# Gini-split decision tree on the balanced PCA training split, depth <= 7
Gini.DT.rpart.PCA <- rpart(binnedY ~ ., data = linkTransformedBalancedTrain.PCA, parms=list(split = "gini"), control =list(maxdepth = 7))
# Evaluate on the internal 30% PCA test split
GiniDTPredict.PCA <- predict(Gini.DT.rpart.PCA, linkTransformedBalancedTest.PCA, type="class")
table(linkTransformedBalancedTest.PCA$binnedY, GiniDTPredict.PCA)
## GiniDTPredict.PCA
## NonSpam Spam
## NonSpam 89 34
## Spam 52 91
plotROC(as.integer(linkTransformedBalancedTest.PCA$binnedY == "Spam"), as.integer(GiniDTPredict.PCA == "Spam"))
auc_roc(as.integer(linkTransformedBalancedTest.PCA$binnedY == "Spam"), as.integer(GiniDTPredict.PCA == "Spam"))
## [1] 0.6796028
# Evaluate on the held-out TEST label set
GiniDTPredict.Final.PCA <- predict(Gini.DT.rpart.PCA, linkTransformedTest.PCA, type="class")
table(linkTransformedTest.PCA$binnedY, GiniDTPredict.Final.PCA)
## GiniDTPredict.Final.PCA
## NonSpam Spam
## NonSpam 1470 516
## Spam 51 71
plotROC(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), as.integer(GiniDTPredict.Final.PCA == "Spam"))
auc_roc(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), as.integer(GiniDTPredict.Final.PCA == "Spam"))
## [1] 0.5437117
# NOTE(review): this prints Gini.DT.rpart (without .PCA), which is not
# defined in this section -- the output below shows original feature names,
# so it was presumably fit in an earlier, non-PCA chunk; confirm it exists.
print(Gini.DT.rpart)
## n= 622
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 622 299 NonSpam (0.48070740 0.51929260)
## 2) log_OP_outdegree_mp_div_pagerank_mp_CP_< 14.31755 202 54 Spam (0.73267327 0.26732673) *
## 3) log_OP_outdegree_mp_div_pagerank_mp_CP_>=14.31755 420 151 NonSpam (0.35952381 0.64047619)
## 6) log_OP_truncatedpagerank_2_mp_div_pagerank_mp_CP_< -0.03106625 120 48 Spam (0.60000000 0.40000000)
## 12) log_OP_outdegree_hp_div_pagerank_hp_CP_>=21.02365 31 2 Spam (0.93548387 0.06451613) *
## 13) log_OP_outdegree_hp_div_pagerank_hp_CP_< 21.02365 89 43 NonSpam (0.48314607 0.51685393)
## 26) L_avgin_of_out_mp>=4.503144 33 10 Spam (0.69696970 0.30303030) *
## 27) L_avgin_of_out_mp< 4.503144 56 20 NonSpam (0.35714286 0.64285714)
## 54) log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_< 3.040749 35 16 Spam (0.54285714 0.45714286)
## 108) log_OP_truncatedpagerank_3_mp_div_pagerank_mp_CP_< -0.4596983 23 6 Spam (0.73913043 0.26086957) *
## 109) log_OP_truncatedpagerank_3_mp_div_pagerank_mp_CP_>=-0.4596983 12 2 NonSpam (0.16666667 0.83333333) *
## 55) log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_>=3.040749 21 1 NonSpam (0.04761905 0.95238095) *
## 7) log_OP_truncatedpagerank_2_mp_div_pagerank_mp_CP_>=-0.03106625 300 79 NonSpam (0.26333333 0.73666667)
## 14) L_outdegree_mp>=3.737386 36 12 Spam (0.66666667 0.33333333)
## 28) L_outdegree_mp< 4.2194 24 4 Spam (0.83333333 0.16666667) *
## 29) L_outdegree_mp>=4.2194 12 4 NonSpam (0.33333333 0.66666667) *
## 15) L_outdegree_mp< 3.737386 264 55 NonSpam (0.20833333 0.79166667)
## 30) log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_< 3.55494 64 24 NonSpam (0.37500000 0.62500000)
## 60) log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_>=2.517176 37 17 Spam (0.54054054 0.45945946)
## 120) log_OP_outdegree_mp_div_pagerank_mp_CP_< 18.89231 19 5 Spam (0.73684211 0.26315789) *
## 121) log_OP_outdegree_mp_div_pagerank_mp_CP_>=18.89231 18 6 NonSpam (0.33333333 0.66666667) *
## 61) log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_< 2.517176 27 4 NonSpam (0.14814815 0.85185185) *
## 31) log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_>=3.55494 200 31 NonSpam (0.15500000 0.84500000)
## 62) log_OP_truncatedpagerank_1_mp_div_pagerank_mp_CP_>=0.1475373 101 25 NonSpam (0.24752475 0.75247525)
## 124) log_OP_outdegree_mp_div_pagerank_mp_CP_>=20.08747 8 1 Spam (0.87500000 0.12500000) *
## 125) log_OP_outdegree_mp_div_pagerank_mp_CP_< 20.08747 93 18 NonSpam (0.19354839 0.80645161) *
## 63) log_OP_truncatedpagerank_1_mp_div_pagerank_mp_CP_< 0.1475373 99 6 NonSpam (0.06060606 0.93939394) *
Unfortunately, even after optimization (we looped through different maximum depths and split types), the decision tree only achieves 54.3% AUC on the final test dataset.
# --- Random forest on the full balanced PCA dataset ---
# Train on all balanced PCA data (not just the train split), with 120 trees,
# 5 candidate variables per split, and fully-grown trees (nodesize = 1).
trainDS.all.all <- linkTransformedBalanced.PCA
forest.All.All <- randomForest(binnedY ~ ., data = trainDS.all.all, nodesize=1, ntree=120, mtry=5)
# Error-vs-trees plot for the fitted forest.
plot(forest.All.All)
# Evaluate on the final (imbalanced) test set.
forestPredict.All.All.Testing <- predict(forest.All.All, linkTransformedTest.PCA, type="class")
table(linkTransformedTest.PCA$binnedY, forestPredict.All.All.Testing)
## forestPredict.All.All.Testing
## NonSpam Spam
## NonSpam 1535 451
## Spam 56 66
# ROC/AUC with "Spam" as the positive class.
plotROC(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), as.integer(forestPredict.All.All.Testing == "Spam"))
auc_roc(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), as.integer(forestPredict.All.All.Testing == "Spam"))
## [1] 0.5462308
The random forest performs only marginally better, at 54.6% AUC on the final test set.
# --- Naive Bayes on the balanced PCA training set ---
naiveBayesModel.PCA <- naiveBayes(binnedY ~ ., data=linkTransformedBalancedTrain.PCA)
# Evaluate first on the balanced hold-out split.
naiveBayesPredict.PCA <- predict(naiveBayesModel.PCA, linkTransformedBalancedTest.PCA, type="class")
table(linkTransformedBalancedTest.PCA$binnedY, naiveBayesPredict.PCA)
## naiveBayesPredict.PCA
## NonSpam Spam
## NonSpam 106 17
## Spam 91 52
# ROC/AUC on the balanced hold-out, "Spam" as the positive class.
plotROC(as.integer(linkTransformedBalancedTest.PCA$binnedY == "Spam"), as.integer(naiveBayesPredict.PCA == "Spam"))
auc_roc(as.integer(linkTransformedBalancedTest.PCA$binnedY == "Spam"), as.integer(naiveBayesPredict.PCA == "Spam"))
## [1] 0.6458471
# Re-evaluate on the final (imbalanced) test set; as with the tree models,
# AUC drops on the imbalanced data.
naiveBayesPredict.Testing.PCA <- predict(naiveBayesModel.PCA, linkTransformedTest.PCA, type="class")
table(linkTransformedTest.PCA$binnedY, naiveBayesPredict.Testing.PCA)
## naiveBayesPredict.Testing.PCA
## NonSpam Spam
## NonSpam 1638 348
## Spam 76 46
plotROC(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), as.integer(naiveBayesPredict.Testing.PCA == "Spam"))
auc_roc(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), as.integer(naiveBayesPredict.Testing.PCA == "Spam"))
## [1] 0.5362053
As expected, the naive Bayes classifier performed worse, at 53.6% AUC on the final test set.